In [1]:
import pandas
from collections import defaultdict

In [2]:
pah = pandas.read_csv('enwiki_pah_misalignment.tsv',sep='\t')

In [3]:
pah.head()


Out[3]:
pageid total_hits assessment_class pop_class dissonance
0 12 49784 GA GA None
1 25 130034 FA FA None
2 39 17751 B B None
3 290 102993 B FA High positive
4 303 62333 B GA Moderate positive

In [4]:
pah.groupby('dissonance').head()


Out[4]:
pageid total_hits assessment_class pop_class dissonance
0 12 49784 GA GA None
1 25 130034 FA FA None
2 39 17751 B B None
3 290 102993 B FA High positive
4 303 62333 B GA Moderate positive
5 305 68077 C A High positive
6 307 235210 GA FA High positive
7 308 118481 B FA High positive
8 324 130784 B FA High positive
9 330 113 Stub Stub None
10 332 1184 Stub Start Moderate positive
11 336 37372 B GA Moderate positive
13 340 2106 C Start Moderate negative
14 344 2611 Start C Moderate positive
17 572 10438 B B None
20 584 5961 Start C Moderate positive
31 621 38624 FA GA High negative
48 663 20906 FA B High negative
51 666 17636 A B High negative
82 742 99 Start Stub Moderate negative
84 748 6067 B C Moderate negative
85 751 45852 FA GA High negative
98 785 9921 GA B Moderate negative
99 786 3888 B C Moderate negative
106 798 7672 FA B High negative

In [5]:
lines = open('english-qid-names2016-03-27.csv','r').readlines()

In [6]:
qidnames = {}
for line in lines:
    qid, enname = line.split(',', maxsplit=1)  # each line is "QID,English page title"
    enname = enname.rstrip('\n')               # drop the trailing newline
    if enname:
        qidnames[qid] = enname

In [7]:
import json

In [8]:
json.dump(qidnames, open('qid_enpage.json','w'))

In [9]:
endf = pandas.DataFrame.from_dict(qidnames,orient='index')

In [10]:
len(endf)


Out[10]:
1346812

Todo:

+ map gender to enwiki page id
+ dissonance class priors (see the sketch after this list)
+ posteriors by gender
+ posterior for no gender.
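
A minimal sketch of the "priors"/"posteriors" items above, assuming the merged pah_gender frame constructed in In [17] and In [20] below:

# Sketch only (not executed here); assumes pah_gender exists as built below.
priors = pah_gender['dissonance'].value_counts(normalize=True)          # P(dissonance class)
posteriors = pah_gender.groupby('gender')['dissonance'].apply(
    lambda s: s.value_counts(normalize=True))                           # P(dissonance class | gender)
# The 'no gender' / 'nonbio' rows of `posteriors` cover the no-gender case.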

In [11]:
bigdf = pandas.read_csv('/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/2016-01-03/gender-index-data-2016-01-03.csv')

In [12]:
gender_qid_df = bigdf[['qid','gender']]

In [13]:
def map_gender(x):
    if isinstance(x, float):        # a missing gender arrives as NaN (a float)
        return 'no gender'
    else:
        gen = x.split('|')[0]       # take the first QID if several genders are listed
        if gen == 'Q6581072':
            return 'female'
        elif gen == 'Q6581097':
            return 'male'
        else:
            return 'nonbin'
gender_qid_df['gender'] = gender_qid_df['gender'].apply(map_gender)


/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [14]:
def qid2enname(x):
    try:
        return qidnames[x]
    except KeyError:
        return None
gender_qid_df['enname'] = gender_qid_df['qid'].apply(qid2enname)


/usr/local/lib/python3.4/dist-packages/ipykernel/__main__.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [15]:
enname_id = pandas.read_csv('/home/notconfusing/workspace/wikidumpparse/wikidump/mediawiki-utilities/enname_id.txt',sep='\t',names=['enname','pageid'])

In [16]:
gender_page_id = pandas.merge(gender_qid_df, enname_id, how='inner',on='enname')

In [17]:
pah_gender = pandas.merge(pah, gender_page_id, how='left', on='pageid')

In [18]:
pah_gender


Out[18]:
pageid total_hits assessment_class pop_class dissonance qid gender enname
0 12 49784 GA GA None NaN NaN NaN
1 25 130034 FA FA None NaN NaN NaN
2 39 17751 B B None NaN NaN NaN
3 290 102993 B FA High positive NaN NaN NaN
4 303 62333 B GA Moderate positive NaN NaN NaN
5 305 68077 C A High positive NaN NaN NaN
6 307 235210 GA FA High positive Q91 male Abraham Lincoln
7 308 118481 B FA High positive Q868 male Aristotle
8 324 130784 B FA High positive NaN NaN NaN
9 330 113 Stub Stub None NaN NaN NaN
10 332 1184 Stub Start Moderate positive NaN NaN NaN
11 336 37372 B GA Moderate positive NaN NaN NaN
12 339 79986 GA FA High positive Q132524 female Ayn Rand
13 340 2106 C Start Moderate negative Q313590 male Alain Connes
14 344 2611 Start C Moderate positive Q959677 male Allan Dwan
15 358 100412 C FA High positive NaN NaN NaN
16 569 82360 C FA High positive NaN NaN NaN
17 572 10438 B B None NaN NaN NaN
18 573 70133 B FA High positive NaN NaN NaN
19 580 9577 Start B High positive NaN NaN NaN
20 584 5961 Start C Moderate positive NaN NaN NaN
21 586 121166 C FA High positive NaN NaN NaN
22 593 51808 C GA High positive NaN NaN NaN
23 594 81903 B FA High positive NaN NaN NaN
24 595 43604 B GA Moderate positive Q7407 male Andre Agassi
25 597 13466 C B Moderate positive NaN NaN NaN
26 599 19356 Start B High positive NaN NaN NaN
27 600 77106 B FA High positive NaN NaN NaN
28 612 22791 C GA High positive NaN NaN NaN
29 615 10122 Start B High positive NaN NaN NaN
... ... ... ... ... ... ... ... ...
3566196 43565014 283 Start Start None NaN NaN NaN
3566197 43571851 242 Start Stub Moderate negative NaN NaN NaN
3566198 43573609 156 Stub Stub None NaN NaN NaN
3566199 43579833 9549 Start B High positive NaN NaN NaN
3566200 43581557 228 Start Stub Moderate negative NaN NaN NaN
3566201 43585233 581 Start Start None NaN NaN NaN
3566202 43591321 70678 B FA High positive NaN NaN NaN
3566203 43593845 706 Stub Start Moderate positive NaN NaN NaN
3566204 43597158 402 Stub Start Moderate positive NaN NaN NaN
3566205 43601561 113 Stub Stub None NaN NaN NaN
3566206 43602987 69 Start Stub Moderate negative NaN NaN NaN
3566207 43603609 3617 C C None NaN NaN NaN
3566208 43603633 619 GA Start High negative NaN NaN NaN
3566209 43603641 4668 Start C Moderate positive NaN NaN NaN
3566210 43605227 2913 Stub C High positive NaN NaN NaN
3566211 43610555 4991 Start C Moderate positive NaN NaN NaN
3566212 43611710 1920 Start Start None NaN NaN NaN
3566213 43614149 265 Stub Start Moderate positive NaN NaN NaN
3566214 43617243 271 Stub Start Moderate positive NaN NaN NaN
3566215 43618039 67 Start Stub Moderate negative NaN NaN NaN
3566216 43618214 45 Stub Stub None NaN NaN NaN
3566217 43618777 312 Start Start None NaN NaN NaN
3566218 43618784 509 Start Start None NaN NaN NaN
3566219 43620318 2431 C C None NaN NaN NaN
3566220 43620339 4386 B C Moderate negative NaN NaN NaN
3566221 43623248 6655 Start B High positive NaN NaN NaN
3566222 43625159 341 Stub Start Moderate positive NaN NaN NaN
3566223 43626507 1385 Start Start None NaN NaN NaN
3566224 43627595 4248 Stub C High positive NaN NaN NaN
3566225 43639346 524 Start Start None NaN NaN NaN

3566226 rows × 8 columns


In [19]:
len(pah), len(gender_page_id), len(pah_gender)


Out[19]:
(3566226, 1135655, 3566226)

Relative risk: P(gender | misaligned) / P(gender).

What proportion of the misaligned dataset is about women?

For each gender, what proportion of each misalignment group do they represent?
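
For example, reading off the figures computed in In [68] below: women make up roughly 5.6% of the "Needs Improvement" group but only about 4.3% of all articles, so their relative risk for that group is 0.056 / 0.043 ≈ 1.30.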


In [20]:
pah_gender['gender'] = pah_gender['gender'].fillna('nonbio')

In [68]:
SE = pah_gender[(pah_gender['dissonance'] == 'Moderate negative') | (pah_gender['dissonance'] == 'High negative')]   # negative dissonance: "Spent Effort"
NI = pah_gender[(pah_gender['dissonance'] == 'Moderate positive') | (pah_gender['dissonance'] == 'High positive')]   # positive dissonance: "Needs Improvement"
rel_risk = defaultdict(dict)
for risk, risk_name in [(SE, 'Spent Effort'), (NI, 'Needs Improvement')]:
    for gender in ['female', 'male', 'nonbin', 'nonbio']:
        gen_mis = len(risk[risk['gender'] == gender])
        p_gen_mis = gen_mis / len(risk)                                             # P(gender | misalignment group)
        p_gen = len(pah_gender[pah_gender['gender'] == gender]) / len(pah_gender)   # P(gender)
        print(p_gen_mis, p_gen)
        rel_risk[gender][risk_name] = p_gen_mis / p_gen                             # relative risk


0.044507631065275194 0.04304157952973255
0.27188571077263574 0.23647603937607992
1.8443662296543352e-05 3.6733510439327174e-05
0.6834114627361173 0.72027628086386
0.0560191847918616 0.04304157952973255
0.20317777218350697 0.23647603937607992
9.749552902965755e-05 3.6733510439327174e-05
0.7406051416522728 0.72027628086386

In [24]:
java_min_int = -2147483648    # sentinel value in the snapshot, treated as missing
allrecs = pandas.read_csv('/media/notconfusing/9d9b45fc-55f7-428c-a228-1c4c4a1b728c/home/maximilianklein/snapshot_data/2016-01-03/gender-index-data-2016-01-03.csv', na_values=[java_min_int])

In [26]:
def sum_column(q_str):
    if isinstance(q_str, str):
        qs = q_str.split('|')
        return len(qs)  # because the format always ends with a '|'
for col in ['site_links']:
    allrecs[col] = allrecs[col].apply(sum_column)

In [27]:
allrecs['site_links'].head(20)


Out[27]:
0     189
1      98
2     211
3     122
4      32
5      96
6      78
7      32
8     138
9      38
10     24
11    165
12     61
13    118
14     80
15     23
16     71
17     67
18    237
19     97
Name: site_links, dtype: float64

In [29]:
allrecs['gender'] = allrecs['gender'].apply(map_gender)

In [78]:
sl_risk = defaultdict(dict)
sl_risk['nonbio']['Sitelink Ratio'] = 1                # the non-biography group gets a ratio of 1 by definition
for gender in ['female', 'male', 'nonbin']:
    gend_df = allrecs[allrecs['gender'] == gender]
    gend_df_size = len(gend_df)
    avg_sl = (gend_df['site_links'].sum() / gend_df_size) / 2.6   # mean sitelinks per item, divided by the 2.6 baseline
    sl_risk[gender]['Sitelink Ratio'] = avg_sl
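
The 2.6 divisor is a hard-coded baseline whose provenance is not shown in this notebook. A slightly clearer variant names it explicitly; this sketch assumes it is an externally derived average sitelink count and reproduces the same arithmetic:

# Sketch only; BASELINE_SITELINKS is assumed to be an externally derived
# average sitelink count, not computed in this notebook.
BASELINE_SITELINKS = 2.6
for gender in ['female', 'male', 'nonbin']:
    gend_df = allrecs[allrecs['gender'] == gender]
    sl_risk[gender]['Sitelink Ratio'] = (gend_df['site_links'].sum() / len(gend_df)) / BASELINE_SITELINKS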

In [79]:
sl_risk_df = pandas.DataFrame.from_dict(sl_risk, orient='index')

In [80]:
rel_risk_df = pandas.DataFrame.from_dict(rel_risk,orient="index")

In [81]:
risk_df = pandas.DataFrame.join(sl_risk_df,rel_risk_df)

In [82]:
risk_df.index = ['Female','Male','Non-binary','Non-biography']

In [83]:
print(risk_df.to_latex(columns = ['Needs Improvement','Spent Effort', 'Sitelink Ratio'],float_format=lambda n:'%.2f' %n))


\begin{tabular}{lrrr}
\toprule
{} &  Needs Improvement &  Spent Effort &  Sitelink Ratio \\
\midrule
Female        &               1.30 &          1.03 &            1.29 \\
Male          &               0.86 &          1.15 &            1.29 \\
Non-binary    &               2.65 &          0.50 &            1.46 \\
Non-biography &               1.03 &          0.95 &            1.00 \\
\bottomrule
\end{tabular}